# Load packages
library(tidyverse)
library(janitor)
library(here)
library(rstan)
library(rstanarm)
library(bayesrules)
library(tidyverse)
library(rstanarm)
library(broom.mixed)
library(tidybayes)
library(bayesplot)
library(bayesrules)
library(tidyverse)
library(rstanarm)
library(broom.mixed)
library(tidybayes)
library(forcats)
library(bayesplot)
library(sf)
library(nycgeo)
library(tidycensus)
# themes
theme_set(theme_minimal())
brown_green <- c("#E9DBC2","#7D9B8A","#4D6F5C","#D29B5B","#744410","#1C432D")
color_scheme_set(brown_green)
nyc_join <- merge(nta_sf,nta_acs_data)
nyc_join <- nyc_join %>%
st_transform(., 4269)
county_list <- nyc_join %>% pull(county_name) %>% unique()
census_api_key("0cc07f06386e317f312adef5e0892b0d002b7254")
census_data <- get_acs(state = "NY",
county = c(county_list),
geography = "tract",
variables = c(gini_inequality ="B19083_001"),
year = 2019,
output = "wide",
survey = "acs5",
geometry = TRUE) %>%
dplyr::select(-c(NAME, ends_with("M"))) %>%
rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2))) %>%
st_transform(., 4269) %>%
dplyr::select(-GEOID)
##
|
| | 0%
|
|= | 1%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 18%
|
|============= | 18%
|
|===================== | 30%
|
|=========================== | 38%
|
|============================ | 39%
|
|============================= | 41%
|
|============================== | 42%
|
|=============================== | 44%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 53%
|
|======================================= | 55%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|================================================ | 69%
|
|========================================================= | 82%
|
|=========================================================== | 84%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|================================================================== | 94%
|
|======================================================================| 100%
vari_names <- read_csv(here("clean_data", "nyc_names_vichy_complied2.csv"))
nyc_clean <- st_read(here("clean_data", "nyc_data_vichy_complied2.shp"))
## Reading layer `nyc_data_vichy_complied2' from data source
## `/Users/sding/Desktop/github/454/454Capstone/clean_data/nyc_data_vichy_complied2.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 224 features and 26 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.04731 ymin: 40.55685 xmax: -73.70002 ymax: 40.91758
## Geodetic CRS: NAD83
colnames(nyc_clean) <- colnames(vari_names)
gini_neighborhood <- st_join(nyc_clean, census_data, left = TRUE) %>%
group_by(nta_id) %>%
summarize(gini_neighborhood=median(gini_inequality, na.rm=T)) %>%
as.tibble() %>%
dplyr::select(nta_id, gini_neighborhood)
nyc_clean <- nyc_clean %>%
as.tibble() %>%
left_join(., gini_neighborhood, by="nta_id")%>%
unique() %>%
st_as_sf()
nyc_compiled <- nyc_clean %>%
mutate(asian_perc = asian_count / total_pop) %>%
mutate(white_perc = white_count / total_pop) %>%
mutate(black_perc = black_count / total_pop) %>%
mutate(latinx_perc = latinx_count / total_pop) %>%
mutate(native_perc = native_count / total_pop) %>%
mutate(noncitizen_perc = noncitizen_count / total_pop) %>%
mutate(evictions_perc = eviction_count / total_pop) %>%
mutate(uninsured_perc = uninsured_count / total_pop) %>%
mutate(unemployment_perc = unemployment_count / total_pop) %>%
mutate(below_poverty_line_perc = below_poverty_line_count / total_pop) %>%
mutate(transportation_desert_4cat =
factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent")))
All data used in this project are from two major sources: the Tidycensus package and NYC Open Data.
Tidycensus is an R package interface, developed by Kyle Walker and Matt Herman, that enables easy access to the US Census Bureau’s data APIs and returns Tidyverse-ready data frames from various major US Census Bureau datasets. Our demographic and socioeconomic data are drawn from the 2019 American Community Survey results found in Tidycensus package. A summary of our ACS data variables is below:
borough: NYC Boroughtotal_pop: Total Population Count by Census Tractmean_income: Mean Income by Census Tractbelow_poverty_line_count: Number of People Living Below the 100% Poverty Line by Census Tractmean_rent: Mean Rent by Censusunemployment_count: Number of People on Unemployment by Census Tractlatinx_count: Number of Latinx People by Census Tractwhite_count: Number of White People by Census Tractblack_count: Number of Black People by Census Tractnative_count: Number of Native People by Census Tractasian_count: Number of Asian People by Census Tractnaturalized_citizen_count: Number of Naturalized Citizens by Census Tractnoncitizen_count: Number of Foreign Born People by Census Tractuninsured_count: Number of Uninsured Citizens of any Age by Census Tractgini_neighborhood : Income inequality measured by the Gini Index per Census TractNote that these predictors are all measured at the census level. To aggregate these estimates at the neighborhood level, we performed two transformations:
Then, for count based demographic predictors, we divide by the total population in each neighborhood to define scaled demographic metrics. They are as follows:
asian_perc: Percentage of Asian People by Neighborhoodwhite_perc: Percentage of White People by Neighborhoodblack_perc: Percentage of Black People by Neighborhoodlatinx_perc: Percentage of Latinx People by Neighborhoodnative_perc: Percentage of Native People by Neighborhoodnoncitizen_perc: Percentage of Foreign Born People by Neighborhooduninsured_perc: Percentage of Uninsured Citizens of any Age by Neighborhoodunemployment_perc: Percentage of People on Unemployment by Neighborhoodbelow_poverty_line_perc: Percentage of people living below the 100% poverty line.We used NYC’s Open Data portal to collect information on the remaining predictors. In particular, we used geotagged locations of Subway Stops, Bus Stops, Grocery Stores, Schools, and Eviction Sites from the Departments of Transportation, Health, Education, and Housing, respectively, to calculate the following variables:
school_count: Public school counts by Neighborhoodeviction_count: Eviction counts by Neighborhoodstore_count: Grocery store and food vendor counts by Neighborhoodsub_count: Subway station counts by Neighborhoodbus_count: Bus station counts by Neighborhoodperc_covered_by_transit: Percent of Neighborhood Within Walking Distance (.5 miles) of Any Subway Stop.transportation_desert_4cat: Subway Accessibility by Neighborhood (Poor, Limited, Satisfactory, Excellent)The process involved grouping geotagged locations by the defined neighborhood boundary regions in R’s SF package and ArcGIS We will detail the process of identifying subway deserts in the “Subway Deserts” section, however, for the remaining predictors, the process is largely the same.
Our data has 224 observations of 38 variables. Below is a preview of our dataset with column names attached.
library(kableExtra)
kable(head(nyc_compiled, n=3)) %>% kable_styling() %>% scroll_box(width = "100%", height = "200px")
| nta_id | total_pop | mean_income | below_poverty_line_count | below_poverty_line_and_50_count | mean_rent | unemployment_count | latinx_count | white_count | black_count | native_count | asian_count | naturalized_citizen_count | noncitizen_count | uninsured_count | school_count | eviction_count | store_count | sub_count | bus_count | mean_ridership | perc_covered_by_transit | nta_type | Name | transportation_desert_4cat | borough | geometry | gini_neighborhood | asian_perc | white_perc | black_perc | latinx_perc | native_perc | noncitizen_perc | evictions_perc | uninsured_perc | unemployment_perc | below_poverty_line_perc |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BK0101 | 26308 | 98338.67 | 2397 | 1289 | 2062.667 | 582 | 3284 | 20526 | 482 | 40 | 1052 | 3777 | 3129 | 1797 | 6 | 68 | 71 | 2 | 53 | 9410.500 | 78.76390 | 0 | Greenpoint | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.94074 4… | 0.4420 | 0.0399878 | 0.7802189 | 0.0183214 | 0.1248289 | 0.0015205 | 0.1189372 | 0.0025848 | 0.0683062 | 0.0221225 | 0.0911130 |
| BK0102 | 57774 | 101238.92 | 9120 | 4474 | 2330.077 | 1710 | 18227 | 32237 | 1460 | 0 | 4008 | 6802 | 7746 | 3725 | 12 | 204 | 129 | 2 | 97 | 26603.000 | 89.80852 | 0 | Williamsburg | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.96355 4… | 0.4804 | 0.0693738 | 0.5579846 | 0.0252709 | 0.3154879 | 0.0000000 | 0.1340742 | 0.0035310 | 0.0644754 | 0.0295981 | 0.1578565 |
| BK0103 | 36891 | 30309.25 | 18285 | 5970 | 1194.875 | 457 | 3351 | 31799 | 1288 | 20 | 194 | 3548 | 1012 | 711 | 6 | 45 | 58 | 3 | 35 | 6348.667 | 88.13439 | 0 | South Williamsburg | Satisfactory | Brooklyn | MULTIPOLYGON (((-73.96762 4… | 0.4880 | 0.0052587 | 0.8619718 | 0.0349137 | 0.0908352 | 0.0005421 | 0.0274322 | 0.0012198 | 0.0192730 | 0.0123878 | 0.4956493 |
The following table presents a numeric summary and breakdown for our variables of interests. However, note that we present most demographic count predictors using their equivalent percent variable.
#summary(modeling_data)
library(table1)
table_print <- table1(~
total_pop + mean_income + mean_rent + gini_neighborhood +
below_poverty_line_perc + unemployment_perc + uninsured_perc +
school_count + eviction_count + store_count +
transportation_desert_4cat + sub_count + bus_count +
noncitizen_perc + white_perc + black_perc + latinx_perc + asian_perc
| borough, data = nyc_compiled %>% as_tibble()) %>% as_tibble()
colnames(table_print) <- c("Variable", "Bronx (N=44)", "Brooklyn (N=64)","Manhattan (N=39)", "Queens (N=77)", "Overall (N=224)")
table_print%>%
filter(Variable!="") %>% kable() %>% kable_styling() %>% scroll_box(width = "100%", height = "500px")
| Variable | Bronx (N=44) | Brooklyn (N=64) | Manhattan (N=39) | Queens (N=77) | Overall (N=224) |
|---|---|---|---|---|---|
| total_pop | |||||
| Mean (SD) | 30200 (18700) | 37800 (24600) | 38300 (24600) | 25900 (20900) | 32300 (22800) |
| Median [Min, Max] | 29800 [0, 69200] | 38300 [0, 97800] | 35700 [0, 95300] | 25000 [0, 87700] | 31600 [0, 97800] |
| mean_income | |||||
| Mean (SD) | 43400 (17900) | 69600 (28700) | 103000 (49700) | 74500 (14400) | 71900 (33900) |
| Median [Min, Max] | 38000 [23100, 94200] | 61200 [27400, 148000] | 108000 [33300, 212000] | 72600 [37500, 104000] | 67200 [23100, 212000] |
| Missing | 8 (18.2%) | 9 (14.1%) | 6 (15.4%) | 19 (24.7%) | 42 (18.8%) |
| mean_rent | |||||
| Mean (SD) | 1230 (197) | 1580 (465) | 1960 (674) | 1650 (198) | 1600 (465) |
| Median [Min, Max] | 1240 [833, 1620] | 1450 [792, 3280] | 2070 [884, 3270] | 1630 [1140, 2250] | 1510 [792, 3280] |
| Missing | 8 (18.2%) | 9 (14.1%) | 6 (15.4%) | 19 (24.7%) | 42 (18.8%) |
| gini_neighborhood | |||||
| Mean (SD) | 0.467 (0.0317) | 0.462 (0.0292) | 0.516 (0.0379) | 0.423 (0.0260) | 0.459 (0.0439) |
| Median [Min, Max] | 0.467 [0.407, 0.519] | 0.466 [0.382, 0.515] | 0.525 [0.436, 0.582] | 0.420 [0.358, 0.484] | 0.457 [0.358, 0.582] |
| below_poverty_line_perc | |||||
| Mean (SD) | 0.251 (0.122) | 0.197 (0.143) | 0.168 (0.129) | 0.119 (0.0832) | 0.179 (0.128) |
| Median [Min, Max] | 0.251 [0, 0.444] | 0.179 [0, 1.00] | 0.126 [0, 0.576] | 0.103 [0, 0.615] | 0.148 [0, 1.00] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| unemployment_perc | |||||
| Mean (SD) | 0.0443 (0.0227) | 0.0462 (0.128) | 0.0294 (0.0143) | 0.0271 (0.0119) | 0.0367 (0.0709) |
| Median [Min, Max] | 0.0455 [0, 0.133] | 0.0300 [0, 1.00] | 0.0285 [0, 0.0767] | 0.0264 [0, 0.0712] | 0.0303 [0, 1.00] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| uninsured_perc | |||||
| Mean (SD) | 0.0788 (0.0421) | 0.0722 (0.0406) | 0.0476 (0.0300) | 0.0812 (0.0512) | 0.0719 (0.0443) |
| Median [Min, Max] | 0.0876 [0, 0.245] | 0.0681 [0, 0.231] | 0.0383 [0, 0.122] | 0.0729 [0, 0.298] | 0.0681 [0, 0.298] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| school_count | |||||
| Mean (SD) | 8.64 (7.44) | 8.16 (6.79) | 8.54 (7.79) | 4.08 (2.88) | 6.92 (6.41) |
| Median [Min, Max] | 5.50 [1.00, 27.0] | 6.00 [1.00, 31.0] | 5.00 [1.00, 28.0] | 3.00 [1.00, 12.0] | 5.00 [1.00, 31.0] |
| eviction_count | |||||
| Mean (SD) | 438 (340) | 245 (234) | 223 (233) | 126 (131) | 238 (255) |
| Median [Min, Max] | 406 [1.00, 1130] | 163 [1.00, 829] | 152 [1.00, 1120] | 93.0 [1.00, 521] | 148 [1.00, 1130] |
| store_count | |||||
| Mean (SD) | 53.2 (38.6) | 69.8 (49.5) | 58.6 (39.8) | 33.1 (33.8) | 52.0 (43.1) |
| Median [Min, Max] | 48.5 [1.00, 138] | 70.5 [1.00, 202] | 54.0 [1.00, 151] | 24.0 [1.00, 147] | 45.0 [1.00, 202] |
| transportation_desert_4cat | |||||
| Poor | 4 (9.1%) | 8 (12.5%) | 2 (5.1%) | 40 (51.9%) | 54 (24.1%) |
| Limited | 12 (27.3%) | 14 (21.9%) | 0 (0%) | 18 (23.4%) | 44 (19.6%) |
| Satisfactory | 8 (18.2%) | 12 (18.8%) | 7 (17.9%) | 9 (11.7%) | 36 (16.1%) |
| Excellent | 20 (45.5%) | 30 (46.9%) | 30 (76.9%) | 10 (13.0%) | 90 (40.2%) |
| sub_count | |||||
| Mean (SD) | 1.95 (1.33) | 2.77 (2.06) | 4.03 (3.53) | 1.55 (1.22) | 2.41 (2.23) |
| Median [Min, Max] | 1.00 [1.00, 7.00] | 2.00 [1.00, 9.00] | 3.00 [1.00, 17.0] | 1.00 [1.00, 6.00] | 1.00 [1.00, 17.0] |
| bus_count | |||||
| Mean (SD) | 40.1 (22.9) | 60.2 (41.1) | 46.6 (26.0) | 56.9 (45.0) | 52.7 (38.0) |
| Median [Min, Max] | 43.5 [1.00, 125] | 59.0 [1.00, 170] | 44.0 [2.00, 106] | 52.0 [1.00, 243] | 49.0 [1.00, 243] |
| noncitizen_perc | |||||
| Mean (SD) | 0.174 (0.0936) | 0.150 (0.132) | 0.139 (0.0506) | 0.174 (0.101) | 0.160 (0.103) |
| Median [Min, Max] | 0.169 [0, 0.581] | 0.127 [0, 1.00] | 0.133 [0, 0.242] | 0.143 [0, 0.471] | 0.143 [0, 1.00] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| white_perc | |||||
| Mean (SD) | 0.114 (0.163) | 0.381 (0.277) | 0.454 (0.268) | 0.287 (0.247) | 0.309 (0.270) |
| Median [Min, Max] | 0.0313 [0, 0.606] | 0.385 [0, 1.00] | 0.496 [0, 0.829] | 0.247 [0, 1.00] | 0.222 [0, 1.00] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| black_perc | |||||
| Mean (SD) | 0.301 (0.185) | 0.292 (0.308) | 0.146 (0.179) | 0.178 (0.273) | 0.231 (0.261) |
| Median [Min, Max] | 0.274 [0.0631, 0.833] | 0.152 [0, 1.00] | 0.0585 [0.00873, 0.591] | 0.0406 [0, 0.899] | 0.121 [0, 1.00] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| latinx_perc | |||||
| Mean (SD) | 0.531 (0.197) | 0.190 (0.163) | 0.246 (0.206) | 0.255 (0.178) | 0.292 (0.221) |
| Median [Min, Max] | 0.618 [0, 0.784] | 0.143 [0, 0.808] | 0.170 [0.0608, 0.724] | 0.217 [0, 0.856] | 0.203 [0, 0.856] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
| asian_perc | |||||
| Mean (SD) | 0.0370 (0.0496) | 0.108 (0.121) | 0.127 (0.116) | 0.241 (0.190) | 0.138 (0.156) |
| Median [Min, Max] | 0.0179 [0, 0.217] | 0.0733 [0, 0.472] | 0.113 [0, 0.661] | 0.205 [0, 0.757] | 0.0777 [0, 0.757] |
| Missing | 3 (6.8%) | 6 (9.4%) | 3 (7.7%) | 15 (19.5%) | 27 (12.1%) |
From the numerical summary observe that Manhattan has the largest population counts, highest mean rental prices, highest mean income, highest income inequality (e.g. gini value), most number of neighborhoods with excellent subway access, and the greatest proportion of white citizens.
Conversely, observe that the Bronx has the lowest mean income and highest eviction counts, while having the highest proportions of people living below the poverty line, living without insurance coverage, and the highest number of people with limited subway access. Importantly, the Bronx also has the highest densities of both Latinx and Black residents of any other borough in New York, meaning that the Black and Latinx residents in New York are experiencing the burden of New York’s structural inequities.
We will touch on the connections between demographics, transportation access, and housing more explicitly in the following sections.
New York City is the most populous city in the US with more than 8.8 million people. To support the daily commutes of its residents, NYC also built the New York City Subway, the oldest, longest, and currently busiest subway system in the US, averaging approximately 5.6 million daily rides on weekdays and a combined 5.7 million rides each weekend. Compared to other US cities where automobiles are the most popular mode of transportation (ahem, Minneapolis), NYC only has about 32% of the population who choose to commute by cars, thanks to its efficient and far-reaching transit system, whereas most metro areas in the US have more than 70% of the population who chose to commute by cars.
Despite having the most extensive transit network in the entire US, NYC is still lacking in terms of transit accessibility for some neighborhoods. The general consensus in academia is that residents who walk more than 0.5 miles to get to reliable transit are considered lacking transportation access, or residing in a transportation desert. For our research, we adopted this concept to study these gaps in transportation access. Specifically, we attempt to identify and study “Subway Deserts”.
Extending the USDA’s definition of a food desert, we define subway deserts as the percentage of a neighborhood— or any arbitrary geographic area— that is within walking distance of any subway stop. Citing the U.S. Federal Highway Administration, we defined walking distance as a 0.5 mile radius and computed these regions in ArcGIS. We chose subway stations because of the subway’s reliable frequency, high connectivity between boroughs, and high ridership per vehicle. Our argument against including the number of bus stops in our calculations of transportation access is that the quantity of bus stops does not accurately imply public transport accessibility due to the variability in bus efficiency, punctuality, and use. A major limitation of our work was the omission of Staten Island because it is not connected to any other borough by subway. Rather, Staten Island users typically drive or train into the city. Further, we felt that the inclusion of Staten Island would mischaracterize the relationship between lacking access and not needing access since Staten Island is an overwhelmingly white, wealthy, borough that has high levels of car ownership (CITE).
We first geocoded subway stop locations in NYC from the NYC Department of Transportation. Then, using ArcGIS we created a 0.5-mile-radius buffer for each station and calculated what percent of each neighborhood was covered by a buffer region. We display an example below.
A nice image.
In the graph, buffer zones are in light pink with overlapping boundaries dissolved between stations, while the dark pink dots indicate the exact geographic locations of the stations. Each neighborhood, then, had a percentage score that defined it’s subway accessibility score.
Upon observation, we categorized the areas served by the subway network into four ordinal categories: Poor, Limited, Satisfactory, and Excellent. These categories are defined at 0-1%, 1-75%, 75-90%, and 90-100% of area covered by transit, respectively. We defined these cutoffs using the distribution of subway coverage percentages and our own judgment on what constitutes a desert. As such, these cutoffs are specific to New York City and may not be perfectly reproducible. The following plot details the spatial locations of these transportation categories.
nyc_compiled %>%
ggplot() +
geom_sf(aes(fill = transportation_desert_4cat), color = "#8f98aa") +
scale_fill_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"),
guide = guide_legend(title = "Subway Accessibility \nCategory"), na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Subway Accessibility by \nNeighborhood in NYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
In the following two subsections, we describe how we understood and classified transportation deserts using two models.
Naive Bayes Model is one of the most popular models for classifying a response variable with 2 or more categories.
We implemented a naive Bayes classifier on subway access because it is both computationally efficient and applicable to Bayesian classification settings where outcomes may have 2+ categories. Specifically, we fit transportation access by taking mean_income, percentage below the poverty line, and the number of grocery stores. Because we are predicting 4 levels of transportation access, we initially fit this model using the e1071 package to classify subway transit level.
library(e1071)
nyc_naive <- read_csv(here("clean_data", "nyc_names_vichy_complied2.csv"))%>%
mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent"))) %>%
mutate(transportation_desert_4num = factor(as.numeric(transportation_desert_4cat))) %>%
mutate(asian_perc = (asian_count / total_pop) * 100) %>%
mutate(white_perc = (white_count / total_pop)* 100) %>%
mutate(black_perc = (black_count / total_pop)* 100) %>%
mutate(latinx_perc = (latinx_count / total_pop)* 100) %>%
mutate(native_perc = (native_count / total_pop)* 100) %>%
mutate(below_poverty_perc = (below_poverty_line_count / total_pop) * 100) %>%
mutate(noncitizen_perc = (noncitizen_count / total_pop)* 100) %>%
mutate(evictions_perc = (eviction_count / total_pop)* 100) %>%
mutate(uninsured_perc = (uninsured_count / total_pop)* 100) %>%
mutate(unemployment_perc = (unemployment_count / total_pop)* 100) %>%
filter(nta_type == 0)
set.seed(454)
naive_model <- naiveBayes(transportation_desert_4cat ~
mean_income +
below_poverty_perc +
store_count,
data = nyc_naive)
naive2_prediction <- naive_classification_summary_cv(naive_model,
nyc_naive,
y="transportation_desert_4cat", k=10)$cv
naive2_prediction
## transportation_desert_4cat Poor Limited Satisfactory Excellent
## Poor 78.12% (25) 12.50% (4) 0.00% (0) 9.38% (3)
## Limited 31.43% (11) 11.43% (4) 20.00% (7) 37.14% (13)
## Satisfactory 20.69% (6) 17.24% (5) 6.90% (2) 55.17% (16)
## Excellent 9.52% (8) 15.48% (13) 2.38% (2) 72.62% (61)
Under 10-fold cross validation, our Naive Bayes model had an overall cross-validated accuracy of 0.5111111%. However, our predictions were most accurate when predicting Poor transportation access (78.12%) and Excellent transportation access (72.62%). The following plot describes the cross-validated accuracy breakdown by each observed transportation access category.
library(tidyverse)
prediction <- as.data.frame(naive2_prediction) %>%
pivot_longer(
cols= Poor:Excellent,
names_to = 'Predictions',
values_to = 'Probability'
) %>%
mutate(Probability = as.numeric(str_extract(Probability,'\\d+\\.\\d+'))/100) %>%
mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent"))) %>%
mutate(Predictions = factor(Predictions, levels=c("Poor", "Limited", "Satisfactory", "Excellent")))
prediction %>%
ggplot(aes(x=transportation_desert_4cat, y=Probability, fill=Predictions)) +
geom_bar(position="fill", stat="identity") +
scale_y_continuous(labels = seq(0, 100, by = 25)) +
labs(title="Accessibility Predictions by Observed Category", y="Proportion", x="")+
theme(panel.grid.major.x = element_line("transparent"),
# axis.text.y.left = element_blank(),
axis.text.x.bottom = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16, hjust=.5, face = "bold")) +
scale_fill_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"),
guide = guide_legend(title = "Subway Accessibility \nCategory"), na.value="#D6D6D6")
From the plot, it is clear that our naive Bayes model is sufficient when predicting the extrema of subway (in)access given the overwhelming proportion of true-poor and true-excellent classifications. However, it remains imperfect when considering the inaccuracy for both the limited and satisfactory transportation categories, our data’s distributions, and its interpretability.
Importantly, naive Bayes assumes that all quantitative predictors are normally distributed within each Y category and further it assumes (i.e. that predictors are independent within each Y category). Our data do not meet these assumptions, unfortunately. Further, naive Bayes is a blackbox classifier. That is, naive Bayes classification might give us accurate predictions, but it doesn’t give us a sense of where these predictions come from or subway access is related to the predictors.
Having realized the shortcomings of the naive Bayes model, we wanted to see if there are any alternatives. We land on the ordinal regression model.
library(rsample)
set.seed(454)
nyc_compiled_classify <- nyc_compiled %>%
mutate(transportation_desert_4cat = factor(transportation_desert_4cat, levels=c("Poor", "Limited", "Satisfactory", "Excellent"))) %>%
mutate(transportation_desert_4num = factor(as.numeric(transportation_desert_4cat))) %>%
mutate(asian_perc = (asian_count / total_pop) * 100) %>%
mutate(white_perc = (white_count / total_pop)* 100) %>%
mutate(black_perc = (black_count / total_pop)* 100) %>%
mutate(latinx_perc = (latinx_count / total_pop)* 100) %>%
mutate(native_perc = (native_count / total_pop)* 100) %>%
mutate(below_poverty_perc = (below_poverty_line_count / total_pop) * 100) %>%
mutate(noncitizen_perc = (noncitizen_count / total_pop)* 100) %>%
mutate(uninsured_perc = (uninsured_count / total_pop)* 100) %>%
mutate(unemployment_perc = (unemployment_count / total_pop)* 100) %>%
filter(nta_type == 0)
data_split <- initial_split(nyc_compiled_classify, prop = .8)
data_train <- training(data_split)
data_test <- testing(data_split)
model2 <- stan_polr(transportation_desert_4num ~
mean_income + below_poverty_line_perc + store_count,
data =data_train, prior_counts = rstanarm::dirichlet(1),
prior=R2(0.5), iter=500, seed = 86437, refresh=0, prior_PD=FALSE)
tidy(model2, effects = "fixed", conf.int = TRUE, conf.level = 0.8) %>%
mutate(term = case_when(
term == "1|2" ~ "Poor | Limited",
term == "2|3" ~ "Limited | Satisfactory",
term == "3|4" ~ "Satisfactory | Excellent",
TRUE ~ term
))%>%
kable(align = "c", caption = "Ordinal Model - Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| mean_income | 0.0000460 | 0.0000104 | 0.0000339 | 0.0000596 |
| below_poverty_line_perc | 17.0648982 | 3.4532734 | 12.8319749 | 22.3269332 |
| store_count | 0.0189996 | 0.0052394 | 0.0126227 | 0.0254789 |
| Poor | Limited | 5.4113154 | 1.2620341 | 3.9385520 | 7.1398347 |
| Limited | Satisfactory | 6.7324319 | 1.3225479 | 5.2493699 | 8.5003721 |
| Satisfactory | Excellent | 7.7039327 | 1.3435146 | 6.1613497 | 9.5624926 |
Then using a function written by Connie Zhang’s, we describe the accuracy of the ordinal model below.
ordinal_accuracy<-function(post_preds,mydata){
post_preds<-as.data.frame(post_preds)
results<-c()
for (j in (1:length(post_preds))){
results[j]<-as.numeric(tail(names(sort(table(post_preds[,j]))))[4])
}
results<-as.data.frame(results)
compare<-cbind(results,mydata$transportation_desert_4num)
compare<-compare %>%mutate(results=as.numeric(results))
compare<-compare %>% mutate(`mydata$transportation_desert_4num`=as.numeric(`mydata$transportation_desert_4num`))
compare<-compare %>%mutate(accuracy=ifelse(as.numeric(results)==as.numeric(`mydata$transportation_desert_4num`),1,0))
print(sum(compare$accuracy)/length(post_preds))
}
nyc_compiled[is.na(nyc_compiled)] = 0
set.seed(86437)
my_prediction2 <- posterior_predict(
model2,
newdata = data_test)
ordinal_accuracy(my_prediction2, data_test)
## [1] 0.6388889
my_prediction2 <- posterior_predict(
model2,
newdata = data_test)
prediction_long <- my_prediction2 %>%
t() %>%
as.tibble() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column() %>%
rowwise(id=rowname) %>%
summarize(median=ifelse(mean(c_across(where(is.numeric)))>3.5, ceiling(mean(c_across(where(is.numeric)))), floor(mean(c_across(where(is.numeric)))))) %>%
rename(predicted_desert = median)%>%
mutate(predicted_desert = case_when(
predicted_desert==1 ~ "Poor",
predicted_desert ==2 ~ "Limited",
predicted_desert ==3 ~ "Satisfactory",
TRUE ~ "Excellent",
)) %>%
mutate(predicted_desert = factor(predicted_desert, levels=c("Poor", "Limited", "Satisfactory", "Excellent")))
data_test %>%
select(nta_id, borough, transportation_desert_4cat) %>%
rownames_to_column() %>%
left_join(., prediction_long, by="rowname") %>%
ggplot(aes(x=transportation_desert_4cat, fill=predicted_desert)) +
geom_bar(position="fill")+
scale_y_continuous(labels = seq(0, 100, by = 25)) +
labs(title="Accessibility Predictions by Observed Category", y="Proportion", x="")+
theme(panel.grid.major.x = element_line("transparent"),
# axis.text.y.left = element_blank(),
axis.text.x.bottom = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold")) +
scale_fill_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"),
guide = guide_legend(title = "Subway Accessibility \nCategory"), na.value="#D6D6D6")
Transportation access is a pervasive structural issue. However, previous research on transportation access has demonstrated that many of these gaps in access also deepen the chasms of racial and class-based inequity. In this NYC analysis, it seemed that black communities were most at risk for being in transportation deserts (CITE). Further, in our own data we previously observed that predominantly Black and Latinx neighborhoods in the Bronx faced some of the highest rates of eviction.
This next section aims to connect transportation access to housing-inequities that we know also have racial and class dimensions. In particular, we wanted to assess transportation access’s relationship with immigrant community size, rental prices, and eviction counts by neighborhood. We felt that this was an appropriate direction because as you can see from the plots below:
desert_map <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = transportation_desert_4cat), color = "#8f98aa") +
scale_fill_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"),
guide = guide_legend(title = "Subway Accessibility \nCategory"), na.value="#D6D6D6") +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Subway Access in \nNYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 30, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
rent_map <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = mean_rent), color = "#8f98aa") +
scale_fill_gradient(low = "#FCF5EE", high = "#30969C",
guide = guide_legend(title = "Dollars")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Mean Rent in \nNYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
evict <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = eviction_count), color = "#8f98aa")+
scale_fill_gradient(low = "#FCF5EE", high = "#C47572",
guide = guide_legend(title = "Number of Evictions")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Eviction Counts in \nNYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 26, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
noncit <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = noncitizen_perc), color = "#8f98aa") +
scale_fill_gradient(low = "#FCF5EE", high = "#5E76AD",
guide = guide_legend(title = "Percent Non-Citizen")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Immigrant Density in \nNYC")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
library(egg)
ggarrange(desert_map, rent_map, evict, noncit,
ncol=2, nrow=2)
Transportation access is typically worst in areas with the highest densities of non-citizen residents, while it is the best in neighborhoods with the highest mean rental prices. Further, observe that eviction counts are highly concentrated in north and south NYC, where some of these neighborhoods have mixed-accessibility to transit.
Unfortunately, rent, transportation access, and eviction counts may also be associated with respective density of nonwhite communities.
white <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = white_perc), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#919BB6", "#5A6687", "#3D465C"), guide = guide_legend(title = "Number White")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("White Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
black <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = black_perc), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#F8ABA6", "#F58581", "#DB3F37"), guide = guide_legend(title = "Number Black")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Black Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
asian <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = asian_perc), color = "#8f98aa") +
scale_fill_gradientn(colors = c("#FCF5EE","#C1C5EC", "#A1A8E2", "#4451C5"), guide = guide_legend(title = "Number Asian")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Asian Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
latinx <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot() +
geom_sf(aes(fill = latinx_perc), color = "#8f98aa")+
scale_fill_gradientn(colors = c("#FCF5EE","#F4E2B8", "#E9C572", "#E77E2F"),
guide = guide_legend(title = "Number Latinx")) +
theme_minimal() +
theme(panel.grid.major = element_line("transparent"),
axis.text = element_blank()) +
ggtitle("Latinx Population")+
theme(panel.grid.major = element_line("transparent"),
plot.title = element_text(size = 24, face = "bold"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12)) +
guides(shape = guide_legend(override.aes = list(size = 8)),
color = guide_legend(override.aes = list(size = 8)))
ggarrange(desert_map, white, black, latinx, asian, ncol=3, nrow=2)
From the plots, neighborhoods with the highest densities of Black and Asian community members also have the poorest scores of subway access, while the converse holds true for neighborhoods with the highest proportion of White residents.
Further, observe that eviction counts are most common in neighborhoods with the highest densities of Black and Latinx community members. The below faceted visualizations detail the specific relationships between the proportion of Black, Latinx, Asian, and White residents in a neighborhood with eviction counts.
black_eviction <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=black_perc,
color=transportation_desert_4cat,
y=eviction_count)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Eviction Counts by Black Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
asian_eviction <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=asian_perc,
color=transportation_desert_4cat,
y=eviction_count)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Eviction Counts by Asian Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
latinx_eviction <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=latinx_perc,
color=transportation_desert_4cat,
y=eviction_count)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Eviction Counts by Latinx Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
white_eviction <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=white_perc,
color=transportation_desert_4cat,
y=eviction_count)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Eviction Counts by White Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
ggarrange(black_eviction, latinx_eviction, asian_eviction, white_eviction, ncol=2, nrow=2)
Black and Latinx residential proportions by neighborhood are associated with increased eviction counts. However, we must specify that Black resident proportions are uniformly associated with increases in eviction counts across transportation levels, while the relationship between eviction counts and Latinx resident proportion is not. In contrast, both White and Asian proportions are uniformly associated with decreases in eviction counts.
Lastly, let’s consider mean neighborhood rental prices. The following visualizations detail the relationships between the proportion of Black, Latinx, Asian, and White residents in a neighborhood with that neighborhood mean rental price.
black_rent <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=black_perc,
color=transportation_desert_4cat,
y=mean_rent)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Neighborhood Rental Price Averages \nby Black Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
asian_rent <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=asian_perc,
color=transportation_desert_4cat,
y=mean_rent)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Neighborhood Rental Price Averages \nby Asian Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
latinx_rent <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=latinx_perc,
color=transportation_desert_4cat,
y=mean_rent)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Neighborhood Rental Price Averages \nby Latinx Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
white_rent <- nyc_compiled %>%
filter(nta_type == 0) %>%
ggplot(aes(x=white_perc,
color=transportation_desert_4cat,
y=mean_rent)) +
geom_point(size=2)+
geom_smooth(method="lm", size=3, se=F)+
labs(title="Neighborhood Rental Price Averages \nby White Density per Transportation Category", y="", y="Black (%)")+
theme_linedraw()+
theme(legend.position = "none",
# axis.text.y.left = element_blank(),
# axis.text.x.bottom = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16,hjust=.5, face = "bold"),
strip.background.x = element_rect(fill="Black", color="Black"),
strip.text.x = element_text(size = 14,
color = "White",
face = "bold")) +
scale_color_manual(values=c("#895F32","#E9DBC2","#7D9B8A", "#395645"))+ facet_wrap(.~transportation_desert_4cat, scales = "free")
ggarrange(black_rent, latinx_rent, asian_rent, white_rent, ncol=2, nrow=2)
Increases in White-resident proportions were uniformly associated with increases in average neighborhood rental prices across all transportation access categories. The converse is true for the relationship between Black and Latinx neighborhood densities and rental prices. It then seems that despite living in cheaper neighborhoods, both NYC’s Black and Latinx communities are carrying the largest burden of eviction.
Our next section details the statistical models we used to better characterize the clear relationships between housing inequities, urban racism, and transportation access.
In order to understand the respective distributions of immigrant population size, evictions, and mean rental prices in NYC, we fit 3 non-hierarchical Bayesian models with each variable as an outcome. In previous iterations, we fit hierarchical models on these same relationships where neighborhood values were grouped by borough. We initially felt that the internal diversity of boroughs with respect to housing, demographics, and wealth made it more appropriate as a grouping variable. However, we ultimately saw that the non-hierarchical models we fit were largely comparable to their hierarchical counterparts. Further, we considered how even though there is a dramatic internal diversity in boroughs their effects on our outcomes were still sufficiently meaningful to warrant a true statistical adjustment.
We list our non-hierarchical models and their predictors below:
transportation_desert_4cat, total_pop, borough, gini_neighborhood, mean_income, mean_rent, unemployment_perc, black_perc, latinx_perc, asian_perctransportation_desert_4cat, borough, gini_neighborhood, mean_income, black_perc, latinx_perc, asian_perc, bus_count,school_count,store_count, noncitizen_perctransportation_desert_4cat, total_pop, borough, gini_neighborhood, mean_income, mean_rent, black_perc, latinx_perc, asian_perc, bus_count,store_count,unemployment_perc, uninsured_percFor 1 and 3, we used weakly informative priors and allowed stan_glm to estimate initial priors. This decision was ultimately due to our unfamiliarity with NYC’s history of evictions and non-citizen population. For Model 2, we specified a normal prior with mean 1600 and standard deviation at 20 using previous experience as renters both in NYC and other comparable major cities. Across all count models, we fit both Poisson and Negative-Binomial prior distributions for the observed eviction and non-citizen counts. However, we observed an inconsistent spread of variance and increased 0 counts in both the eviction and immigrant count data. As such, we felt that a Negative-Binomial distribution was more appropriate for both data.
Our model specifications are detailed in the following subsections
modeling_data <- nyc_compiled %>%
mutate(black_perc = black_perc * 10) %>%
mutate(white_perc = white_perc * 10) %>%
mutate(latinx_perc = latinx_perc * 10) %>%
mutate(asian_perc = asian_perc * 10) %>%
mutate(native_perc = native_perc * 10) %>%
mutate(unemployment_perc = unemployment_perc * 5) %>%
mutate(uninsured_perc = uninsured_perc * 5) %>%
mutate(mean_income = mean_income / 100) %>%
mutate(mean_rent = mean_rent / 100) %>%
filter(nta_type==0)
noncit_model <- stan_glm(
noncitizen_count ~
transportation_desert_4cat +
total_pop + borough + gini_neighborhood +
mean_income + mean_rent + unemployment_perc +
black_perc + latinx_perc + asian_perc,
data = modeling_data,
family = neg_binomial_2,
chains = 5, iter = 2000*2, seed = 84735, refresh = 0
)
\[\begin{split} \text{Non-Citizen Count} \mid \beta_{0c}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu, r) \; \; \; \; \text{where} \log(\mu) = \beta_{0c} + \sum^{14}_{k=1}X_{k}\beta_k \\ \beta_{0c} &\sim N(0,2.5^2)\\ \beta_{1} &\sim N(0,6.2785^2)\\ \beta_{2} &\sim N(0,6.7918^2)\\ \beta_{3} &\sim N(0,5.0879^2)\\ \beta_{4} &\sim N(0, 0.0001^2)\\ \beta_{5} &\sim N(0,5.5216^2)\\ \beta_{6} &\sim N(0,6.5781^2)\\ \beta_{7} &\sim N(0,5.2519^2)\\ \beta_{8} &\sim N(0,56.96^2)\\ \beta_{9} &\sim N(0,0.006^2)\\ \beta_{10} &\sim N(0,0.3317^2)\\ \beta_{11} &\sim N(0,7.3986^2)\\ \beta_{12} &\sim N(0,0.9779^2)\\ \beta_{13} &\sim N(0,1.0952^2)\\ \beta_{14} &\sim N(0,1.638^2)\\ r & \sim Exp(1) \\ \end{split}\]
rent_model <- stan_glm(
mean_rent ~
transportation_desert_4cat +
bus_count + school_count + store_count +
total_pop + borough +
gini_neighborhood + mean_income + black_perc + latinx_perc + asian_perc,
data = modeling_data,
family = gaussian,
prior_intercept = normal(1600 , 20),
chains = 5, iter = 2000*2, seed = 84735, refresh = 0
)
\[\begin{split} \text{Mean Rent} \mid \beta_{0c}, \beta_1, ..., \beta_k, r & \sim \text{Normal}(\mu, \sigma) \; \; \; \; \text{where} \mu = \beta_{0c} + \sum^{14}_{k=1}X_{k}\beta_k \\ \beta_ {0c} &\sim N(1600,20^2)\\ \beta_{1} &\sim N(0,47.315^2)\\ \beta_{2} &\sim N(0,51.1837^2)\\ \beta_{3} &\sim N(0,38.3432^2)\\ \beta_{4} &\sim N(0,0.4958^2)\\ \beta_{5} &\sim N(0,2.9378^2)\\ \beta_{6} &\sim N(0,0.4371^2)\\ \beta_{7} &\sim N(0,0.0008^2)\\ \beta_{8} &\sim N(0,41.6113^2)\\ \beta_{9} &\sim N(0,49.5728^2)\\ \beta_{10} &\sim N(0,39.5783^2)\\ \beta_{11} &\sim N(0,429.2544^2)\\ \beta_{12} &\sim N(0,0.0454^2)\\ \beta_{13} &\sim N(0,7.3695^2)\\ \beta_{14} &\sim N(0,8.2532^2)\\ \beta_{15} &\sim N(0,12.3441^2)\\ \sigma & \sim Exp(0.13) \\ \end{split}\]
We chose our prior specifications of mean rental price using Juthi’s experience renting in NYC and a group conversation about typical rental prices we would elect to pay in NYC, Los Angeles, and other major cities we have lived in or around.
eviction_model <- stan_glm(
eviction_count ~
transportation_desert_4cat +
total_pop + borough +
gini_neighborhood + mean_income + mean_rent +
black_perc + latinx_perc + asian_perc,
data = modeling_data,
family = neg_binomial_2,
chains = 5, iter = 2000*2, seed = 84735, refresh = 0
)
\[\begin{split} \text{Eviction Count} \mid \beta_{0c}, \beta_1, ..., \beta_k, r & \sim \text{NegBin}(\mu, r) \; \; \; \; \text{where} \log(\mu) = \beta_{0c} + \sum^{14}_{k=1}X_{k}\beta_k \\ \beta_{0c} &\sim N(0,2.5^2)\\ \beta_{1} &\sim N(0,6.2992^2)\\ \beta_{2} &\sim N(0,6.7813^2)\\ \beta_{3} &\sim N(0,4.9972^2)\\ \beta_{4} &\sim N(0,1e-04^2)\\ \beta_{5} &\sim N(0,5.4697^2)\\ \beta_{6} &\sim N(0,6.443^2)\\ \beta_{7} &\sim N(0,5.3347^2)\\ \beta_{8} &\sim N(0,56.9002^2)\\ \beta_{9} &\sim N(0,0.0074^2)\\ \beta_{10} &\sim N(0,0.5699^2)\\ \beta_{11} &\sim N(0,0.993^2)\\ \beta_{12} &\sim N(0,1.142^2)\\ \beta_{13} &\sim N(0,1.6003^2)\\ r & \sim Exp(1) \\ \end{split}\]
In the following sections, we go through each particular model’s outcome and what they tell us about the relationships between transportation and housing.
pp_check(noncit_model)+
xlab("Non-Citizen Count") +
labs(title = "Negative Binomial Model of \nImmigrant/Non-Citizen Count")+
theme(plot.title = element_text(face="bold", size=20, hjust=.5))
non_citizen_clean <- modeling_data %>%
na.omit()%>%
arrange(noncitizen_count)
list <- non_citizen_clean %>%
arrange(noncitizen_count) %>% pull(noncitizen_count)
set.seed(84735)
predictions_non_citizen <- posterior_predict(
noncit_model, newdata = non_citizen_clean)
ppc_intervals(list, yrep = predictions_non_citizen,
prob_outer = 0.8) +
ggplot2::scale_x_continuous(
labels = non_citizen_clean$nta_id,
breaks = 1:nrow(non_citizen_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw()+
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
prediction_summary_cv(model = noncit_model, data=modeling_data, k=10)$cv %>%
kable(align = "c", caption = "Non-Citizen Model - Cross-Validated Error Metrics") %>%
kable_styling()
| mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|
| 2799.686 | 2.424508 | 0.1055556 | 0.5055556 |
Under 10-fold cross-validation, we can see how our model performs when predicting new, randomly-assorted, non-citizen counts. In this case, it seems that the 10-fold CV median absolute error (MAE) is 2704. (2.38 sd) meaning that the typical difference between the averages of our posterior prediction distributions is 2704 people or 2.37 standard deviations away from the observed number of non-citizen counts. Beyond accuracy metrics, it also seems that only 10.55% and 51.67% of the observed non-citizen counts are falling within their 50% and 95% posterior prediction intervals. Together, this indicates our model’s performance when predicting non-citizen counts may need further work.
library(kableExtra)
tidy(noncit_model, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Non-Citizen Model - Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 646.2336556 | 0.5796958 | 306.0856641 | 1394.6080720 |
| transportation_desert_4catLimited | 21.8211134 | 0.0968957 | 7.4802317 | 38.2773377 |
| transportation_desert_4catSatisfactory | 30.2269405 | 0.1038534 | 13.9913248 | 49.0942148 |
| transportation_desert_4catExcellent | 29.8440521 | 0.1023578 | 13.3463433 | 48.5186263 |
| total_pop | 0.0025371 | 0.0000017 | 0.0023216 | 0.0027461 |
| boroughBrooklyn | 16.6136923 | 0.1079475 | 1.6899821 | 34.0724482 |
| boroughManhattan | 20.5835833 | 0.1297466 | 2.2686502 | 42.8453923 |
| mean_income | -0.0793199 | 0.0002426 | -0.1102853 | -0.0476387 |
| mean_rent | 6.2938700 | 0.0171338 | 3.9675372 | 8.6886472 |
| black_perc | 5.2070100 | 0.0163627 | 2.9904044 | 7.4540175 |
| latinx_perc | 13.4606683 | 0.0242572 | 10.0812702 | 17.0711789 |
| asian_perc | 19.6300153 | 0.0258335 | 15.7585721 | 23.6996981 |
Limited Subway Access: When controlling for all other predictors, a neighborhood with limited transit access is expected to have approximately 21.82% more non-citizens than a neighborhood with poor transit access. There’s an 80% probability that this increase could lie anywhere between (7.48%, 38.27%) non citizen residents, indicating that neighborhoods with limited transit access almost certainly have more non citizen residents than neighborhoods with poor access.
Satisfactory Subway Access: When controlling for all other predictors, a neighborhood with satisfactory transit access is expected to have approximately 30.22% more non-citizens than a neighborhood with poor transit access. There’s an 80% probability that this increase could lie anywhere between (13.99%, 49.09%) non citizen residents, indicating that neighborhoods with satisfactory transit access almost certainly have more non citizen residents than neighborhoods with poor access.
Excellent Subway Access: When controlling for all other predictors, a neighborhood with excellent transit access is expected to have approximately 29.84% more non-citizens than a neighborhood with poor transit access. There’s an 80% probability that this increase could lie anywhere between (13.34%, 48.51%) non citizen residents, indicating that neighborhoods with excellent transit access almost certainly have more non citizen residents than neighborhoods with poor access.
Brooklyn: When controlling for all other predictors, a neighborhood in Brooklyn is expected to have approximately 16.61% more non-citizens than a neighborhood in the Bronx. There’s an 80% probability that this increase could lie anywhere between (1.68%, 34.07) non citizen residents, indicating that neighborhoods in Brooklyn almost certainly have more non citizen residents than the Bronx, but the magnitude of this increase may vary.
Manhattan: When controlling for all other predictors, a neighborhood in Manhattan is expected to have approximately 20.58% more non-citizens than in the Bronx. There’s an 80% probability that this increase could lie anywhere between (2.268, 42.85) non citizen residents, indicating that neighborhoods in Manhattan almost certainly have more non citizen residents than the Bronx.
Mean Income: When controlling for all other predictors, a 100 dollar increase in mean neighborhood income is associated with approximately a 0.0793% decrease in non citizen count. However, there is a 80% chance that the decrease in non citizen count may be any value between (0.1102, 0.0476), indicating that there is almost certainly a negative relationship between mean income and non citizen count, but its magnitude may vary.
Mean Rent: When controlling for all other predictors, a 100 dollar increase in mean neighborhood rent is associated with approximately a 6.29% increase in non citizen counts. However, there is an 80% chance that the increase in non citizen count may be any value between (3.96%, 8.68%), indicating that there is almost certainly a positive relationship between mean rent and non citizen count, but its magnitude may vary.
Black Percentage: When controlling for all other predictors, a 10% increase in the Black population in a neighborhood is associated with approximately a 5.20% increase in non citizen counts. However, there is an 80% chance that the increase in non citizen count may be any value between (2.99%, 7.45%), indicating that there is almost certainly a positive relationship between Black resident percentage and non citizen count, but its magnitude may vary.
Latinx Percentage: When controlling for all other predictors, a 10% increase in the Latinx population in a neighborhood is associated with approximately a 13.46% increase in non citizen count. However, there is an 80% chance that the increase in non citizen count may be any value between (10.08%, 17.07%), indicating that there is almost certainly a positive relationship between Latinx resident percentage and non citizen count, but its magnitude may vary.
Asian Percentage: When controlling for all other predictors, a 10% increase in the Asian population in a neighborhood is associated with approximately a 19.63% increase in non citizen count. However, there is an 80% chance that the increase in non citizen count may be any value between (15.75%, 23.69%), indicating that there is almost certainly a positive relationship between Asian resident percentage and non citizen count, but its magnitude may vary.
pp_check(rent_model)+
xlab("Mean Rental Price") +
labs(title = "Normal Model of \nMonthly Mean Rental Price")+
theme(plot.title = element_text(face="bold", size=20, hjust=.5))
rent_clean <- modeling_data %>%
na.omit()%>%
arrange(mean_rent)
list <- rent_clean %>%
arrange(mean_rent) %>% pull(mean_rent)
set.seed(84735)
predictions_rent <- posterior_predict(
rent_model, newdata = rent_clean)
ppc_intervals(list,
yrep = predictions_rent,
prob_outer = 0.8) +
scale_x_continuous(
labels = rent_clean$nta_id,
breaks = 1:nrow(rent_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw() +
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
prediction_summary_cv(model = rent_model, data=nyc_compiled, k=10)$cv %>%
kable(align = "c", caption = "Mean Rent Model - Cross-Validated Error Metrics") %>%
kable_styling()
| mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|
| 117.354 | 0.6569101 | 0.4907115 | 0.7725296 |
Under 10-fold cross-validation, we can see how our model performs when predicting new, randomly-assorted, mean rental prices. In this case, it seems that the 10-fold CV median absolute error (MAE) is 119.256 (0.68 sd) meaning that the typical difference between the averages of our posterior prediction distributions is 119.25 dollars or .68 standard deviations away from the observed mean rental price. Beyond accuracy metrics, it also seems that only 49.55% and 77.23% of the observed mean rental price estimates are falling within their 50% and 95% posterior prediction intervals, respectively. That’s great! Together, this indicates our model’s performance when predicting mean rental prices by neighborhood is superb.
library(kableExtra)
tidy(rent_model, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Rent Model - Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 8.0899937 | 2.6195905 | 4.7672539 | 11.4222283 |
| store_count | 0.0104850 | 0.0059110 | 0.0030623 | 0.0180846 |
| mean_income | 0.0120611 | 0.0006062 | 0.0112967 | 0.0128377 |
| black_perc | -0.1022345 | 0.0736815 | -0.1953202 | -0.0091913 |
| asian_perc | 0.2003005 | 0.1129153 | 0.0595393 | 0.3482055 |
For rent model:
Store Count: When controlling for all other predictors, as grocery store and food vendor counts in a neighborhood increase by 1, rent increases by approximately $1.049 dollars. However, there is a 80% chance that the increase associated with store count may be any value between (0.307, 1.799) dollars, indicating that there is almost certainly a positive relationship between store count and rent, but its magnitude may vary slightly.
Mean Income: When controlling for all other predictors, a 100 dollar increase in mean neighborhood income is associated with approximately a $1.20 increase in rent. However, there is a 80% chance that the increase associated with rental price increases may be any value between (1.129, 1.284) dollars, indicating that there is almost certainly a positive relationship between mean income and mean rental prices, but its magnitude may vary slightly.
Black Percentage: When controlling for all other predictors, a 10% increase in the black proportion of a neighborhood is associated with approximately a $10.20 decrease in rent. However, there is an 80% chance that the decrease associated with rent may be any value between (19.36, 0.685) dollars, indicating that there is almost certainly a negative relationship between black resident percentage and rent, but its magnitude may vary.
Asian Percentage: When controlling for all other predictors, a 10% increase in the Asian population in a neighborhood is associated with approximately a $20.52 increase in rent. However, there is an 80% chance that the increase associated with rent may be any value between (5.763, 34.87), indicating that there is almost certainly a positive relationship between Asian resident percentage and rent, but its magnitude may vary greatly.
pp_check(eviction_model)+
xlab("Eviction Count") +
xlim(0,2000)+
labs(title = "Negative Binomial Model of \nEviction Count")+
theme(plot.title = element_text(face="bold", size=20, hjust=.5))
eviction_clean <- modeling_data %>%
na.omit()%>%
arrange(eviction_count)
list <- eviction_clean %>%
arrange(eviction_count) %>% pull(eviction_count)
set.seed(84735)
predictions_eviction <- posterior_predict(
eviction_model, newdata = eviction_clean)
ppc_intervals(list,
yrep = predictions_eviction,
prob_outer = 0.8) +
scale_x_continuous(
labels = eviction_clean$nta_id,
breaks = 1:nrow(eviction_clean)) +
xaxis_text(angle = 90, hjust = 1) +
theme_linedraw() +
theme(panel.grid.major = element_line("transparent"),
axis.title.x = element_blank(),
axis.text.x = element_blank())
prediction_summary_cv(model = eviction_model, data=nyc_compiled, k=10)$cv %>%
kable(align = "c", caption = "Eviction Model - Cross-Validated Error Metrics") %>%
kable_styling()
| mae | mae_scaled | within_50 | within_95 |
|---|---|---|---|
| 66.95797 | 1.552947 | 0.2592885 | 0.5802372 |
Under 10-fold cross-validation, we can see how our model performs when predicting new, randomly-assorted, eviction counts. In this case, it seems that the 10-fold CV median absolute error (MAE) is 63.139 (1.56 sd) meaning that the typical difference between the averages of our posterior prediction distributions is 63 counts or 1.56 standard deviations away from the observed number of eviction counts. Beyond accuracy metrics, it also seems that only 25.89% and 58.92% of the observed eviction counts are falling within their 50% and 95% posterior prediction intervals. Together, this indicates our model’s performance when predicting eviction counts may need further work.
library(kableExtra)
tidy(eviction_model, effects = "fixed", conf.int = TRUE, conf.level = 0.8)%>%
mutate(estimate= ifelse(term == "(Intercept)", exp(estimate), (exp(estimate)-1)*100),
conf.low= ifelse(term == "(Intercept)", exp(conf.low), (exp(conf.low)-1)*100),
conf.high = ifelse(term == "(Intercept)", exp(conf.high), (exp(conf.high)-1)*100))%>%
filter(conf.low > 0 & conf.high > 0 | conf.low < 0 & conf.high < 0) %>%
kable(align = "c", caption = "Eviction Model - Summary") %>%
kable_styling()
| term | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| (Intercept) | 25.1153852 | 0.6640682 | 10.7108552 | 58.3015048 |
| transportation_desert_4catLimited | 39.8142056 | 0.1117065 | 21.1074597 | 61.5205875 |
| transportation_desert_4catSatisfactory | 41.2507768 | 0.1171572 | 21.8925348 | 64.3921302 |
| transportation_desert_4catExcellent | 40.5846940 | 0.1181260 | 20.6687118 | 63.6290625 |
| total_pop | 0.0022473 | 0.0000019 | 0.0020083 | 0.0024903 |
| boroughBrooklyn | -39.6044331 | 0.1222681 | -48.4523352 | -29.2619633 |
| boroughManhattan | -19.2391339 | 0.1447390 | -32.7647680 | -2.3682836 |
| boroughQueens | -28.8494627 | 0.1230540 | -39.5014175 | -16.6339924 |
| gini_neighborhood | 658.5249924 | 1.1780131 | 73.7386748 | 3243.9315409 |
| mean_income | -0.1182042 | 0.0002976 | -0.1553186 | -0.0799461 |
| mean_rent | 4.5479523 | 0.0192714 | 1.9355581 | 7.1291966 |
| black_perc | 17.7731246 | 0.0189693 | 15.0288160 | 20.6738937 |
| latinx_perc | 6.6045671 | 0.0289748 | 2.7257152 | 10.6945575 |
| asian_perc | -4.2067980 | 0.0310719 | -7.8802878 | -0.2344382 |
After removing predictors whose 80% credible intervals included the possibility of non-effect when controlling for other covariates, we found that there were 13 remaining predictors of an arbitrary neighborhood’s eviction counts. In the following list, categorical predictors’ classes were listed together:
In the following section, we interpret each predictor. However, we included total population to control for eviction count’s incidence relative to neighborhood population. As such, we will not interpret its specific \(\beta\) value.
Next, we interpret each predictor:
We’ve confirmed that even after adjusting for income, rental prices, income inequality, and borough differences, neighborhoods in NYC with a substantial proportion of Black and Latinx residents are experiencing dramatic increased risks of eviction. Interestingly, when accounting for economic and demographic predictors, the relationships with transportation we expected to see were reversed. That is, increased transportation access was associated with more evictions. This could be related to the increased population sizes in areas with the best access to transportation (e.g. Manhattan) or it could be because the disparities that transportation inaccess reflects (e.g. racial and class-based tensions) have been accounted for. However, our results are almost certainly confounded by the spatial relationships between neighborhoods and the effect of other unmeasured structural predictors in housing equity such as a neighborhood’s rent control policies or the reasons behind eviction.